{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install bert-tensorflow==1.0.1\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/toxicity/kaum.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/toxicity/weak-learning-toxicity.json\n",
    "# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/fastformer-base-social-media-2021-11-02.tar.gz\n",
    "# !tar -zxf fastformer-base-social-media-2021-11-02.tar.gz\n",
    "# !rm fastformer-base-social-media-2021-11-02.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "translated-0.json\n",
      "translated-1000000.json\n",
      "translated-1050000.json\n",
      "translated-1100000.json\n",
      "translated-1150000.json\n",
      "translated-1200000.json\n",
      "translated-1450000.json\n",
      "translated-150000.json\n",
      "translated-1500000.json\n",
      "translated-1550000.json\n",
      "translated-1600000.json\n",
      "translated-1650000.json\n",
      "translated-1700000.json\n",
      "translated-1750000.json\n",
      "translated-1800000.json\n",
      "translated-250000.json\n",
      "translated-300000.json\n",
      "translated-350000.json\n",
      "translated-400000.json\n",
      "translated-450000.json\n",
      "translated-50000.json\n",
      "translated-500000.json\n",
      "translated-550000.json\n",
      "translated-600000.json\n",
      "translated-650000.json\n",
      "translated-700000.json\n",
      "translated-750000.json\n",
      "translated-850000.json\n",
      "translated-900000.json\n",
      "translated-950000.json\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Root URL of the translated toxicity shards.\n",
    "base = 'https://f000.backblazeb2.com/file/malay-dataset/toxicity/'\n",
    "\n",
    "# Numbered shard listing; the leading 'N. ' is stripped when it is parsed below.\n",
    "urls = \"\"\"\n",
    "1. translated-0.json\n",
    "2. translated-1000000.json\n",
    "3. translated-1050000.json\n",
    "4. translated-1100000.json\n",
    "5. translated-1150000.json\n",
    "6. translated-1200000.json\n",
    "7. translated-1450000.json\n",
    "8. translated-150000.json\n",
    "9. translated-1500000.json\n",
    "10. translated-1550000.json\n",
    "11. translated-1600000.json\n",
    "12. translated-1650000.json\n",
    "13. translated-1700000.json\n",
    "14. translated-1750000.json\n",
    "15. translated-1800000.json\n",
    "16. translated-250000.json\n",
    "17. translated-300000.json\n",
    "18. translated-350000.json\n",
    "19. translated-400000.json\n",
    "20. translated-450000.json\n",
    "21. translated-50000.json\n",
    "22. translated-500000.json\n",
    "23. translated-550000.json\n",
    "24. translated-600000.json\n",
    "25. translated-650000.json\n",
    "26. translated-700000.json\n",
    "27. translated-750000.json\n",
    "28. translated-850000.json\n",
    "29. translated-900000.json\n",
    "30. translated-950000.json\n",
    "\"\"\"\n",
    "urls = [line.split('. ', 1)[1] for line in urls.split('\\n') if line]\n",
    "for u in urls:\n",
    "    print(u)\n",
    "    # wget keeps an existing file and stores the new copy as '<name>.1'; a later\n",
    "    # glob over this directory could then load the same shard twice, so skip\n",
    "    # anything that is already here. Delete a partial file to force a re-download.\n",
    "    if os.path.exists(u):\n",
    "        continue\n",
    "    status = os.system(f'wget {base}{u}')\n",
    "    # os.system never raises; report a non-zero status instead of silently\n",
    "    # carrying on with a missing shard.\n",
    "    if status != 0:\n",
    "        print(f'download failed for {u} (status {status})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "normalized_chars = {}\n",
    "\n",
    "chars = '‒–―‐—━—-▬'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = '-'\n",
    "\n",
    "chars = '«»“”¨\"'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = '\"'\n",
    "\n",
    "chars = \"’'ʻˈ´`′‘’\\x92\"\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = \"'\"\n",
    "\n",
    "chars = '̲_'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = '_'\n",
    "\n",
    "chars = '\\xad\\x7f'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = ''\n",
    "\n",
    "chars = '\\n\\r\\t\\u200b\\x96'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = ' '\n",
    "\n",
    "    \n",
    "laughing = {\n",
    "    'huhu',\n",
    "    'haha',\n",
    "    'gagaga',\n",
    "    'hihi',\n",
    "    'wkawka',\n",
    "    'wkwk',\n",
    "    'kiki',\n",
    "    'keke',\n",
    "    'huehue',\n",
    "    'hshs',\n",
    "    'hoho',\n",
    "    'hewhew',\n",
    "    'uwu',\n",
    "    'sksk',\n",
    "    'ksks',\n",
    "    'gituu',\n",
    "    'gitu',\n",
    "    'mmeeooww',\n",
    "    'meow',\n",
    "    'alhamdulillah',\n",
    "    'muah',\n",
    "    'mmuahh',\n",
    "    'hehe',\n",
    "    'salamramadhan',\n",
    "    'happywomensday',\n",
    "    'jahagaha',\n",
    "    'ahakss',\n",
    "    'ahksk'\n",
    "}\n",
    "\n",
    "def make_cleaning(s, c_dict):\n",
    "    s = s.translate(c_dict)\n",
    "    return s\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"\n",
    "    use by any transformer model before tokenization\n",
    "\n",
    "    Pipeline: `unidecode` transliteration, per-token `normalized_chars`\n",
    "    translation, '(dot)' -> '.', removal of the attributes of the first\n",
    "    `<a ...>` tag when they mention 'href', URL removal, space-padding of\n",
    "    '.', ',' and '/', dropping of tokens that start with '@', lower-casing,\n",
    "    and a 50% random drop of tokens containing a `laughing` fragment.\n",
    "\n",
    "    Returns the surviving tokens joined by single spaces. The result depends\n",
    "    on the state of the `random` module.\n",
    "    \"\"\"\n",
    "    string = unidecode(string)\n",
    "\n",
    "    string = ' '.join(\n",
    "        [make_cleaning(w, normalized_chars) for w in string.split()]\n",
    "    )\n",
    "    string = re.sub(r'\\(dot\\)', '.', string)\n",
    "\n",
    "    # The attribute text is arbitrary content (URLs with '?', '+', '(' ...), so it\n",
    "    # is removed as a literal substring; using it as a regex pattern either\n",
    "    # fails to match or raises re.error. The same tag is used for the 'href'\n",
    "    # check and for the removal.\n",
    "    anchors = re.findall(r'<a (.*?)>', string)\n",
    "    if anchors and 'href' in anchors[0]:\n",
    "        string = string.replace(' ' + anchors[0], '')\n",
    "\n",
    "    string = re.sub(\n",
    "        r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n",
    "    )\n",
    "\n",
    "    for c in '.,/':\n",
    "        string = string.replace(c, f' {c} ')\n",
    "\n",
    "    kept = []\n",
    "    for word in re.sub(r'[ ]+', ' ', string).strip().split():\n",
    "        if word[0] == '@':\n",
    "            continue\n",
    "        # Tokens are emitted lower-cased; no re-capitalisation is applied.\n",
    "        word = word.lower()\n",
    "        # NOTE(review): this is a substring test, so ordinary words that contain a\n",
    "        # fragment (e.g. 'begitu' contains 'gitu') are dropped half the time too\n",
    "        # - confirm this is intended.\n",
    "        if any(laugh in word for laugh in laughing) and random.random() < 0.5:\n",
    "            continue\n",
    "        kept.append(word)\n",
    "    return ' '.join(kept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = \"\"\"\n",
    "1. severe toxic\n",
    "2. obscene\n",
    "3. identity attack\n",
    "4. insult\n",
    "5. threat\n",
    "6. asian\n",
    "7. atheist\n",
    "8. bisexual\n",
    "9. black\n",
    "10. buddhist\n",
    "11. christian\n",
    "12. female\n",
    "13. heterosexual\n",
    "14. indian\n",
    "15. homosexual, gay or lesbian\n",
    "16. intellectual or learning disability\n",
    "17. jewish\n",
    "18. latino\n",
    "19. male\n",
    "20. muslim\n",
    "21. other disability\n",
    "22. other gender\n",
    "23. other race or ethnicity\n",
    "24. other religion\n",
    "25. other sexual orientation\n",
    "26. physical disability\n",
    "27. psychiatric or mental illness\n",
    "28. transgender\n",
    "29. white\n",
    "30. malay\n",
    "31. chinese\n",
    "\"\"\"\n",
    "labels = [l.split('. ')[1].strip() for l in labels.split('\\n') if len(l)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['translated-1750000.json',\n",
       " 'translated-1450000.json',\n",
       " 'translated-700000.json',\n",
       " 'translated-350000.json',\n",
       " 'translated-600000.json',\n",
       " 'translated-900000.json',\n",
       " 'translated-1000000.json',\n",
       " 'translated-1100000.json',\n",
       " 'translated-550000.json',\n",
       " 'translated-150000.json',\n",
       " 'translated-500000.json',\n",
       " 'translated-1500000.json',\n",
       " 'translated-1150000.json',\n",
       " 'translated-750000.json',\n",
       " 'translated-850000.json',\n",
       " 'translated-1650000.json',\n",
       " 'translated-300000.json',\n",
       " 'translated-650000.json',\n",
       " 'translated-950000.json',\n",
       " 'translated-250000.json',\n",
       " 'translated-1600000.json',\n",
       " 'translated-0.json',\n",
       " 'translated-1550000.json',\n",
       " 'translated-1800000.json',\n",
       " 'translated-450000.json',\n",
       " 'translated-50000.json',\n",
       " 'translated-1050000.json',\n",
       " 'translated-1200000.json',\n",
       " 'translated-1700000.json',\n",
       " 'translated-400000.json']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import glob\n",
    "\n",
    "# glob returns names in arbitrary order, so sort for a reproducible row order.\n",
    "# The pattern is limited to '.json' shards: the looser 'translated*' would also\n",
    "# pick up wget's duplicate copies such as 'translated-0.json.1'.\n",
    "files = sorted(glob.glob('translated-*.json'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "translated-1750000.json\n",
      "translated-1450000.json\n",
      "translated-700000.json\n",
      "translated-350000.json\n",
      "translated-600000.json\n",
      "translated-900000.json\n",
      "translated-1000000.json\n",
      "translated-1100000.json\n",
      "translated-550000.json\n",
      "translated-150000.json\n",
      "translated-500000.json\n",
      "translated-1500000.json\n",
      "translated-1150000.json\n",
      "translated-750000.json\n",
      "translated-850000.json\n",
      "translated-1650000.json\n",
      "translated-300000.json\n",
      "translated-650000.json\n",
      "translated-950000.json\n",
      "translated-250000.json\n",
      "translated-1600000.json\n",
      "translated-0.json\n",
      "translated-1550000.json\n",
      "translated-1800000.json\n",
      "translated-450000.json\n",
      "translated-50000.json\n",
      "translated-1050000.json\n",
      "translated-1200000.json\n",
      "translated-1700000.json\n",
      "translated-400000.json\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1401054"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "X, Y = [], []\n",
    "\n",
    "for file in files:\n",
    "    print(file)\n",
    "    with open(file) as fopen:\n",
    "        rows = json.load(fopen)\n",
    "    for row in rows:\n",
    "        # Rows whose label vector does not have exactly 29 entries are skipped.\n",
    "        if len(row[1]) != 29:\n",
    "            continue\n",
    "        X.append(row[0])\n",
    "        # Pad to 31 values; presumably the two zeros stand for the trailing\n",
    "        # 'malay' and 'chinese' labels - TODO confirm.\n",
    "        Y.append(row[1] + [0, 0])\n",
    "\n",
    "len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "rejected_labels = ['black', 'white', 'jewish', 'latino']\n",
    "# Look the positions up before `labels` is filtered and keep them: the bare\n",
    "# expression used to be discarded, while the DataFrame filter in the next cell\n",
    "# hard-codes these same column numbers. The assert pins that assumption.\n",
    "rejected_ix = [labels.index(l) for l in rejected_labels]\n",
    "assert rejected_ix == [8, 28, 16, 17], rejected_ix\n",
    "labels = [l for l in labels if l not in rejected_labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Column numbers of the rejected labels in the 31-wide label vectors.\n",
    "rejected_columns = [8, 28, 16, 17]\n",
    "\n",
    "ydf = pd.DataFrame(np.array(Y))\n",
    "# Keep rows where every rejected column is 0, then remove those columns.\n",
    "keep_row = (ydf[rejected_columns] == 0).all(axis = 1)\n",
    "ydf = ydf.loc[keep_row].drop(rejected_columns, axis = 1)\n",
    "# Surviving row positions, used to select the matching texts afterwards.\n",
    "ix = ydf.index.tolist()\n",
    "Y = ydf.values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1361040, 1361040)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the texts at the row positions listed in `ix`.\n",
    "X = list(map(X.__getitem__, ix))\n",
    "len(X), len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {'severe_toxic': 'severe toxic', 'identity_hate': 'identity attack',\n",
    "          'toxic': 'severe toxic', 'melayu': 'malay', 'cina': 'chinese', 'india': 'indian'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_onehot(tags, depth = len(labels)):\n",
    "    \"\"\"\n",
    "    Build a 0/1 list of length `depth` with a 1 at the `labels` position of each tag.\n",
    "\n",
    "    `depth` defaults to the length `labels` had when this function was defined.\n",
    "    A tag that is not in `labels` raises ValueError (from `list.index`).\n",
    "    \"\"\"\n",
    "    onehot = [0 for _ in range(depth)]\n",
    "    for position in map(labels.index, tags):\n",
    "        onehot[position] = 1\n",
    "    return onehot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "melayu 84851\n",
      "cina 43956\n",
      "india 20208\n"
     ]
    }
   ],
   "source": [
    "with open('kaum.json') as fopen:\n",
    "    kaum = json.load(fopen)\n",
    "\n",
    "# One line per top-level key: the key and the number of items stored under it.\n",
    "for group in kaum:\n",
    "    print(group, len(kaum[group]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('weak-learning-toxicity.json') as fopen:\n",
    "    scores = json.load(fopen)\n",
    "\n",
    "for k, rows in scores.items():\n",
    "    for no, row_scores in enumerate(rows):\n",
    "        # Every score that rounds to 1 becomes a tag, renamed through `mapping`\n",
    "        # when an entry exists for it.\n",
    "        tags = [\n",
    "            mapping.get(name, name)\n",
    "            for name, score in row_scores.items()\n",
    "            if round(score) == 1\n",
    "        ]\n",
    "        # The top-level key itself is always added as a tag.\n",
    "        tags.append(mapping[k])\n",
    "        Y.append(generate_onehot(tags))\n",
    "        # Row `no` of scores[k] is paired with text `no` of kaum[k].\n",
    "        X.append(kaum[k][no])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1510055/1510055 [05:44<00:00, 4380.80it/s] \n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Clean every text in place, with a progress bar.\n",
    "for i, text in enumerate(tqdm(X)):\n",
    "    X[i] = cleaning(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1510055/1510055 [00:01<00:00, 1021533.90it/s]\n"
     ]
    }
   ],
   "source": [
    "actual_t, actual_l = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    # Skip texts that are 2 characters or shorter after cleaning.\n",
    "    if len(X[i]) <= 2:\n",
    "        continue\n",
    "    actual_t.append(X[i])\n",
    "    actual_l.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from malaya.text.bpe import WordPieceTokenizer\n",
    "\n",
    "# Build the WordPiece tokenizer from the local vocabulary file 'BERT.wordpiece'\n",
    "# (presumably shipped in the fastformer archive from the first cell - TODO confirm).\n",
    "tokenizer = WordPieceTokenizer('BERT.wordpiece', do_lower_case = False)\n",
    "# tokenizer.tokenize('halo nama sayacomel')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1502318/1502318 [23:20<00:00, 1073.03it/s]\n"
     ]
    }
   ],
   "source": [
    "# `input_masks` stays defined (and empty, as before) so the name keeps existing;\n",
    "# the per-row mask that used to be built here was never stored or read.\n",
    "input_ids, input_masks = [], []\n",
    "Y = []\n",
    "\n",
    "# Sequences longer than this, counting [CLS] and [SEP], are left out.\n",
    "maxlen = 150\n",
    "for i in tqdm(range(len(actual_t))):\n",
    "    text = actual_t[i]\n",
    "    tokens_a = tokenizer.tokenize(text)\n",
    "    tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n",
    "    input_id = tokenizer.convert_tokens_to_ids(tokens)\n",
    "\n",
    "    if len(input_id) <= maxlen:\n",
    "        input_ids.append(input_id)\n",
    "        # Labels are appended together with the ids so both lists stay aligned.\n",
    "        Y.append(actual_l[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1351180, 1351180)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Both lists are appended to together in the tokenization loop, so the lengths should match.\n",
    "len(input_ids), len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0,\n",
       " 0.0]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first stored label vector.\n",
    "Y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Write the token ids and label vectors to disk as a single [input_ids, Y] list.\n",
    "with open('toxicity-fastformer.pkl', 'wb') as fopen:\n",
    "    pickle.dump([input_ids, Y], fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rm: cannot remove '*.json': No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "# -f keeps rm from failing when a pattern matches nothing (no .zip/.csv files,\n",
    "# or a re-run after the JSON files have already been removed).\n",
    "!rm -f *.json *.zip *.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
